################################################################################
##
## Purpose: This script prepares and cleans the data on children's gender.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/raw/demographics/demographics.csv: Prepped data from 0_DATA_demographics_scraper.R (not run as part of replication)
##    - ./data/prepped/demographics/toCheck_kids_jbmanual.csv: Manually prepared data from an output of this script (./data/prepped/demographics/toCheck_kids.csv)
##    - ./data/prepped/demographics/manual_children_AK_JB_NF.csv: Manually prepared data from an output of this script (./data/prepped/demographics/toCheck_kids_hard_ones.csv)
##    - ./data/prepped/demographics/demographics_clean.csv: Saved intermediate file from this script 
##  - Outputs:
##    - ./data/prepped/demographics/demographics_clean.csv
##    - ./data/prepped/demographics/toCheck_kids.csv
##    - ./data/prepped/demographics/toCheck_kids_hard_ones.csv
##    - ./data/prepped/demographics/child_gender_final.csv
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace and release memory before the run.
rm(list = ls())
gc()

# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, so the script would fail later at the first call that
# needs the package.
library(gender)
library(tidyverse)

# Seed the random number generator.
set.seed(123)

# Compute details: log the run date and basic information about the machine.
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
os_name <- Sys.info()[['sysname']]
if(os_name == 'Windows') {
  # Query memory and CPU details through wmic; the first element of each result
  # is the column header.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if(os_name == 'Linux') {
  # Sys.info() reports 'Linux'; the previous comparison against 'Linuxs' could
  # never be TRUE, so this branch was unreachable.
  ps_lines <- system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE)
  # ps pads its columns with a variable number of spaces, so trim each line and
  # split on runs of whitespace instead of relying on fixed token positions.
  # The first line is the header and is dropped.
  ps_fields <- strsplit(trimws(ps_lines[-1]), "\\s+")
  df <- do.call(rbind, lapply(ps_fields,
                              function(x) data.frame(
                                cpu = as.numeric(x[1]),
                                mem = as.numeric(x[2]),
                                pid = as.numeric(x[3]),
                                cmd = paste(x[-c(1:3)], collapse = " "))))
  # Print explicitly so the table reaches the log even when the script is
  # source()d; df is NULL when no rsession process is listed.
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Load demographic data generated by 0_DATA_demographics_scraper.R
children <- read_csv('./data/raw/demographics/demographics.csv')

# For every distinct value of the `child` field, extract the children's first
# names and count how many of them gender() labels as female / male.
# The distinct values are computed once, and the per-value results are
# collected in a pre-sized list that is bound together after the loop.
childStrings <- unique(children$child)
kidsGenderList <- vector('list', length(childStrings))
for(i in seq_along(childStrings)) {
  # Replace "[[Page N]]" markers with a space, drop everything from the first
  # semicolon onwards and the word "Governor", then drop single capital-letter
  # initials ("X."), "Jr.," and any remaining semicolons.
  childText <- gsub('\\[\\[Page \\d+\\]\\]',' ',childStrings[i])
  childText <- gsub(';.*|Governor','',childText)
  childText <- gsub('[A-Z]\\.|Jr\\.,|;','',childText)
  # Split on commas or " and ", trim, and keep only the text before the first
  # space of each piece (the first name).
  kids <- gsub(' .*','',trimws(str_split(childText,',| and ')[[1]]))
  # Guard against entries where institutional text ended up in the child field.
  if(any(grepl('Congress|House|Senat',kids))) {
    stop('Child field contains Congress/House/Senat text: ',childStrings[i],call. = FALSE)
  }
  # A missing child field yields NA here; leave it out of the lookup table.
  if(is.na(kids[1])) { next }
  # Drop empty pieces left behind by the cleaning above.
  kids <- kids[kids != '']
  tmp <- gender(kids)
  # nKids counts every extracted name; nDaughters / nSons count only the rows
  # returned by gender(), so the two can disagree.
  nKids <- length(kids)
  nDaughters <- nrow(tmp %>% filter(gender == 'female'))
  nSons <- nrow(tmp %>% filter(gender == 'male'))
  # Number of gender() rows for the first listed name that are labelled female.
  firstDaughter <- nrow(tmp %>% filter(name == kids[1],gender == 'female'))
  kidsGenderList[[i]] <- tibble(id = childStrings[i],
                                nKids = nKids,nDaughters = nDaughters,nSons = nSons,firstDaughter = firstDaughter)
  cat(kids,':',nKids,':',nDaughters,'\n')
}
# bind_rows() ignores the NULL slots left by skipped entries.
kidsGender <- bind_rows(kidsGenderList)

# Attach the per-entry child counts to every demographics row, matching the
# `child` text against the `id` column of kidsGender.
finalChildren <- left_join(children,kidsGender,by = c('child' = 'id'))
# Drop the `...1` column (presumably the unnamed index column of the input
# CSV -- confirm against the raw file).
finalChildren <- select(finalChildren,-`...1`)

# Save the intermediate file; it is read back in for the final build.
write.csv(finalChildren,file = './data/prepped/demographics/demographics_clean.csv')


# Continued spot checking: keep the distinct (fam, child) pairs where the
# daughters and sons do not add up to the number of extracted names, or where
# the child field is missing although the fam field is not.
toCheck <- finalChildren %>%
  filter((nKids != nDaughters + nSons) | (is.na(child) & !is.na(fam))) %>%
  select(fam,child) %>%
  distinct()

write.csv(toCheck,file = './data/prepped/demographics/toCheck_kids.csv')

# Manually checked version, but there is still more to do
checked <- read_csv('./data/prepped/demographics/toCheck_kids_jbmanual.csv')

# Keep the (name, granId) pairs whose manual row is flagged DK == 1.
# The join has no `by`, so it uses every column the two tables share.
dkRows <- checked %>% filter(DK == 1)
toCheck2 <- finalChildren %>%
  select(name,granId,fam,child) %>%
  left_join(dkRows) %>%
  filter(DK == 1) %>%
  select(name,granId) %>%
  distinct()

# Printed for inspection only: the first three granId values, with the
# 'CDIR-' + digits prefix removed, split on '-'.
data.frame(
  t(as.matrix(unlist(
    str_split(gsub('CDIR-\\d{4}-\\d{2}-\\d{2}-','',toCheck2$granId[1:3]),'-')
  )))
)

# Derive `stab` (first two characters after the prefix) and `dist` (the rest,
# with the two-capital-letter code and dash removed), then collapse all `dist`
# values of a (name, stab) pair into one comma-separated string.
toCheck2 <- toCheck2 %>%
  mutate(dist = gsub('CDIR-\\d{4}-\\d{2}-\\d{2}-','',granId),
         stab = substr(dist,1,2),
         dist = gsub('[A-Z]{2}-','',dist)) %>%
  select(name,stab,dist) %>%
  distinct() %>%
  group_by(name,stab) %>%
  summarise(dist = paste(dist,collapse = ', '),.groups = 'drop')

# Let's divide and conquer with the coauthors
write.csv(toCheck2,file = './data/prepped/demographics/toCheck_kids_hard_ones.csv')


# Final build
# Read the two manually coded files and the intermediate file written earlier
# in this script. `...1` is renamed to X1 in the two files that carry it.
collab_checked <- read_csv('./data/prepped/demographics/manual_children_AK_JB_NF.csv')
jb_checked <- read_csv('./data/prepped/demographics/toCheck_kids_jbmanual.csv') %>%
  rename(X1 = `...1`)
full <- read_csv('./data/prepped/demographics/demographics_clean.csv') %>%
  rename(X1 = `...1`)

# Expand every collaborator-coded row whose `dist` holds several values
# separated by ', ' into one row per value. seq_len() keeps the loop from
# running on a zero-row input, and the pieces are collected in a list and bound
# once instead of growing a data frame inside the loop.
collabRows <- vector('list', nrow(collab_checked))
for(i in seq_len(nrow(collab_checked))) {
  districts <- str_split(collab_checked$dist[i],', ')[[1]]
  collabRows[[i]] <- bind_rows(lapply(districts,function(d) {
    collab_checked[i,] %>% mutate(dist = d)
  }))
}
collab_cleaned <- bind_rows(collabRows)
  
# Manually checked rows without a DK flag: bring in name / granId from the full
# data (joined on whatever columns the two tables share), derive dist, stab and
# the name+dist+stab key, and drop the columns no longer needed.
toMerge <- jb_checked %>%
  filter(is.na(DK)) %>%
  left_join(select(full,name,granId,fam,child)) %>%
  mutate(dist = gsub('CDIR-\\d{4}-\\d{2}-\\d{2}-','',granId),
         stab = substr(dist,1,2),
         dist = gsub('[A-Z]{2}-','',dist)) %>%
  select(-X1,-DK,-fam,-child) %>%
  distinct() %>%
  mutate(uniqID = paste0(name,dist,stab))

# Printed for inspection only: rows whose name contains 'FORTNEY', ordered by
# the first four-digit number found in granId.
toMerge %>%
  filter(grepl('FORTNEY',name)) %>%
  mutate(year = as.numeric(str_extract(granId,'\\d{4}'))) %>%
  select(nKids,name,granId,year) %>%
  arrange(year)

# Give the collaborator-coded rows the same name+dist+stab key as toMerge.
collab_cleaned <- collab_cleaned %>%
  mutate(uniqID = paste0(name,dist,stab))

# Collaborator-coded rows take precedence: drop any toMerge row with the same
# key, then append the collaborator rows. Their count columns are coerced to
# numeric and the bookkeeping columns are dropped first.
# across() replaces the superseded mutate_at(); the column pattern is unchanged.
toMerge <- bind_rows(
  toMerge %>% filter(!uniqID %in% collab_cleaned$uniqID),
  collab_cleaned %>%
    mutate(across(matches('^n(Kids|Daughters|Sons)|^first'),as.numeric)) %>%
    select(-Coder,-Comments,-X,-Source,-matches('12|13|14'))
)


# Add dist, stab and the name+dist+stab key to the full (automated) data, using
# the same derivation as for the manually checked rows.
fullTmp <- full %>%
  mutate(dist = gsub('CDIR-\\d{4}-\\d{2}-\\d{2}-','',granId),
         stab = substr(dist,1,2),
         dist = gsub('[A-Z]{2}-','',dist),
         uniqID = paste0(name,dist,stab))


# Join the (granId, uniqID) pairs of the full data onto the manual rows.
# NOTE(review): no `by` is given, so dplyr joins on every shared column. If
# toMerge already has a granId column at this point, the key is
# (granId, uniqID) and rows with a missing granId will not receive one from
# fullTmp -- confirm this is intended.
granIdLookup <- select(fullTmp,granId,uniqID)
toMerge <- left_join(toMerge,granIdLookup)

# Stack the automated rows (those whose key has no manual replacement) on top
# of the manual rows, tagging each with its origin, and set missing child
# counts to 0.
# all_of() makes explicit that the columns are selected by an external
# character vector; across() replaces the superseded mutate_at().
final <- fullTmp %>%
  filter(!uniqID %in% toMerge$uniqID) %>%
  select(all_of(colnames(toMerge))) %>%
  mutate(type = 'automated') %>%
  bind_rows(toMerge %>% mutate(type = 'manual')) %>%
  select(-uniqID) %>%
  mutate(across(matches('n(Kids|Sons)|Daughter'),function(x) ifelse(is.na(x),0,x)))

# Printed for inspection only: (name, dist, stab, year) combinations that still
# have more than one distinct row once granId is dropped.
final %>%
  mutate(year = as.numeric(str_extract(granId,'\\d{4}'))) %>%
  select(-granId) %>%
  distinct() %>%
  group_by(name,dist,stab,year) %>%
  mutate(n=n()) %>%
  filter(n > 1) %>%
  arrange(year,name)

write.csv(final,file = './data/prepped/demographics/child_gender_final.csv')


# EOF